@@ -28,10 +28,10 @@ module Agents |
||
28 | 28 |
"extract": { |
29 | 29 |
"url": { "css": "#comic img", "value": "@src" }, |
30 | 30 |
"title": { "css": "#comic img", "value": "@title" }, |
31 |
- "body_text": { "css": "div.main", "value": "text()" } |
|
31 |
+ "body_text": { "css": "div.main", "value": ".//text()" } |
|
32 | 32 |
} |
33 | 33 |
|
34 |
- "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and "text()" is to extract the enclosed text. You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove comma from a formatted number, etc. |
|
34 |
+ "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is the expression to extract all of the enclosed text. You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc. Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`. |
|
35 | 35 |
|
36 | 36 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
37 | 37 |
|
@@ -162,8 +162,6 @@ module Agents |
||
162 | 162 |
# Node#xpath() returns any numeric value as float; |
163 | 163 |
# convert it to integer as appropriate. |
164 | 164 |
value = value.to_i if value.to_i == value |
165 |
- when Nokogiri::XML::NodeSet |
|
166 |
- value = value.first |
|
167 | 165 |
end |
168 | 166 |
value.to_s |
169 | 167 |
} |
@@ -7,7 +7,7 @@ class AdoptXpathInWebsiteAgent < ActiveRecord::Migration |
||
7 | 7 |
agent.options['extract'].each { |name, extraction| |
8 | 8 |
case |
9 | 9 |
when extraction.delete('text') |
10 |
- extraction['value'] = 'text()' |
|
10 |
+ extraction['value'] = './/text()' |
|
11 | 11 |
when attr = extraction.delete('attr') |
12 | 12 |
extraction['value'] = "@#{attr}" |
13 | 13 |
end |
@@ -10,8 +10,8 @@ jane_website_agent: |
||
10 | 10 |
:expected_update_period_in_days => 2, |
11 | 11 |
:mode => :on_change, |
12 | 12 |
:extract => { |
13 |
- :title => {:css => "item title", :value => 'text()'}, |
|
14 |
- :url => {:css => "item link", :value => 'text()'} |
|
13 |
+ :title => {:css => "item title", :value => './/text()'}, |
|
14 |
+ :url => {:css => "item link", :value => './/text()'} |
|
15 | 15 |
} |
16 | 16 |
}.to_json.inspect %> |
17 | 17 |
|
@@ -769,7 +769,7 @@ describe AgentDrop do |
||
769 | 769 |
mode: 'on_change', |
770 | 770 |
extract: { |
771 | 771 |
url: { css: '[id^=strip_enlarged_] img', value: '@src' }, |
772 |
- title: { css: '.STR_DateStrip', value: 'text()' }, |
|
772 |
+ title: { css: '.STR_DateStrip', value: './/text()' }, |
|
773 | 773 |
}, |
774 | 774 |
}, |
775 | 775 |
schedule: 'every_12h', |
@@ -257,7 +257,6 @@ describe Agents::WebsiteAgent do |
||
257 | 257 |
'mode' => "on_change", |
258 | 258 |
'extract' => { |
259 | 259 |
'url' => {'css' => "#topLeft a", 'value' => "@href"}, |
260 |
- 'title' => {'css' => "#topLeft a", 'value' => "text()"} |
|
261 | 260 |
} |
262 | 261 |
} |
263 | 262 |
rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site) |
@@ -287,6 +286,25 @@ describe Agents::WebsiteAgent do |
||
287 | 286 |
event.payload['num_links'].should == "9" |
288 | 287 |
end |
289 | 288 |
|
289 |
+ it "should return all texts concatenated if XPath returns many text nodes" do |
|
290 |
+ rel_site = { |
|
291 |
+ 'name' => "XKCD", |
|
292 |
+ 'expected_update_period_in_days' => 2, |
|
293 |
+ 'type' => "html", |
|
294 |
+ 'url' => "http://xkcd.com", |
|
295 |
+ 'mode' => "on_change", |
|
296 |
+ 'extract' => { |
|
297 |
+ 'slogan' => {'css' => "#slogan", 'value' => ".//text()"} |
|
298 |
+ } |
|
299 |
+ } |
|
300 |
+ rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site) |
|
301 |
+ rel.user = users(:bob) |
|
302 |
+ rel.save! |
|
303 |
+ rel.check |
|
304 |
+ event = Event.last |
|
305 |
+ event.payload['slogan'].should == "A webcomic of romance, sarcasm, math, and language." |
|
306 |
+ end |
|
307 |
+ |
|
290 | 308 |
describe "JSON" do |
291 | 309 |
it "works with paths" do |
292 | 310 |
json = { |